{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0fd5b6f7-4abf-45f0-9864-ba019a60ea3e",
   "metadata": {},
   "source": [
    "### gwRVIS scores for misexpression-associated SVs "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "27076400-7580-43b3-90b8-cd79535ed3f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from io import StringIO\n",
    "from pybedtools import BedTool\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "0c51857e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inputs \n",
    "chrom = \"chr21\"\n",
    "data_type=\"gwrvis\"\n",
    "\n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "sv_bed_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_chr_num_misexp_genes.bed\")\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "gwrvis_path = f\"/lustre/scratch125/humgen/resources/JARVIS-gwRVIS-scores/hg38/gwRVIS/gwrvis_single_nt.{chrom}.hg38.bed.gz\"\n",
    "out_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/gwrvis\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "fe4b4c12-e845-4115-8c0f-1bf05b552a95",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Scoring SVs on chr21 with gwrvis score ...\n",
      "Number of variants in windows on chr21: 311\n",
      "Number of variants with gwRVIS intersect chr21: 311\n",
      "NAs in dataframe: False\n",
      "Unique variants in final output: 311\n"
     ]
    }
   ],
   "source": [
    "out_dir_path = Path(out_dir)\n",
    "out_dir_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "chrom_num = int(chrom.split(\"chr\")[1])\n",
    "\n",
    "\n",
    "print(f\"Scoring SVs on {chrom} with {data_type} score ...\")\n",
    "vrnts_bed_columns = {0:\"chrom\", 1:\"start\", 2:\"end\", 3:\"vrnt_id\"}\n",
    "\n",
    "bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:f\"{data_type}_chrom\", \n",
    "                      5:f\"{data_type}_start\", 6:f\"{data_type}_end\", 7:f\"{data_type}_score\", 8:\"overlap\"}\n",
    "\n",
    "# load SV info\n",
    "sv_info_df =pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "\n",
    "# load variants in windows \n",
    "sv_bed = BedTool(sv_bed_path)\n",
    "sv_bed_sorted = sv_bed.sort()\n",
    "vrnts_in_windows_df = pd.read_csv(sv_bed_path, sep=\"\\t\", header=None, dtype={3:str}).rename(columns=vrnts_bed_columns)\n",
    "chrom_vrnts_in_windows_df = vrnts_in_windows_df[vrnts_in_windows_df.chrom == chrom_num]\n",
    "chrom_vrnts_in_windows = set(chrom_vrnts_in_windows_df.vrnt_id.unique())\n",
    "print(f\"Number of variants in windows on {chrom}: {len(chrom_vrnts_in_windows)}\")\n",
    "\n",
    "# load gwRVIS scores \n",
    "gwrvis_bed = BedTool(gwrvis_path)\n",
    "\n",
    "# extract overlapping gwRVIS \n",
    "sv_intersect_gwrvis_str = StringIO(str(sv_bed_sorted.intersect(gwrvis_bed, wo=True)))\n",
    "sv_intersect_gwrvis_df = pd.read_csv(sv_intersect_gwrvis_str, sep=\"\\t\", header=None, dtype={0: int, 3: str}).rename(columns=bed_intersect_cols)\n",
    "\n",
    "# only keep SV that intersect the correct chromosome \n",
    "# gwRVIS bed files seem to have mix of chromosomes \n",
    "sv_intersect_gwrvis_chrom_df = sv_intersect_gwrvis_df[sv_intersect_gwrvis_df.vrnt_chrom == chrom_num]\n",
    "# count variant with gwRVIS score \n",
    "sv_with_gwrvis = set(sv_intersect_gwrvis_chrom_df.vrnt_id.unique())\n",
    "print(f\"Number of variants with gwRVIS intersect {chrom}: {len(sv_with_gwrvis)}\")\n",
    "\n",
    "if len(sv_with_gwrvis) > len(chrom_vrnts_in_windows):\n",
    "    raise ValueError(\"Intersection file contains more variants than input file\")\n",
    "if len(sv_with_gwrvis - chrom_vrnts_in_windows) != 0: \n",
    "    raise ValueError(f\"Variants in intersection file not in input: {intersect_sv - chrom_vrnts_in_windows}\")\n",
    "# expand dataframe so that each base pair in SV has a single score\n",
    "sv_intersect_gwrvis_chrom_expanded_df = sv_intersect_gwrvis_chrom_df.loc[sv_intersect_gwrvis_chrom_df.index.repeat(sv_intersect_gwrvis_chrom_df.overlap)][[\"vrnt_id\", f\"{data_type}_score\"]]\n",
    "# check correct number of entries\n",
    "if sv_intersect_gwrvis_chrom_expanded_df.shape[0] != sv_intersect_gwrvis_chrom_df.overlap.sum():\n",
    "    raise ValueError(\"Different number of entries and overlapping base pairs\")\n",
    "    \n",
    "vrnts_in_windows_gwrvis_df = chrom_vrnts_in_windows_df.copy()\n",
    "# calculate mean, meadian, first quartile, third quartile and max gwRVIS per SV \n",
    "gwrvis_metrics_df = pd.DataFrame(sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].mean()).reset_index().rename(columns={\"gwrvis_score\":\"mean\"})\n",
    "gwrvis_metrics_df[\"median\"] = sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].median().tolist()\n",
    "gwrvis_metrics_df[\"quart1\"] = sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].quantile(0.25).tolist()\n",
    "gwrvis_metrics_df[\"quart3\"] = sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].quantile(0.75).tolist()\n",
    "# compute mean of 4 metrics \n",
    "gwrvis_metrics_df[\"vitsios\"] = gwrvis_metrics_df[[\"mean\", \"median\", \"quart1\", \"quart3\"]].mean(axis=1)\n",
    "gwrvis_metrics_df[\"max\"] = sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].max().tolist()\n",
    "gwrvis_metrics_df[\"min\"] = sv_intersect_gwrvis_chrom_expanded_df.groupby(\"vrnt_id\")[\"gwrvis_score\"].min().tolist()\n",
    "# add metrics\n",
    "vrnts_in_windows_gwrvis_df = pd.merge(vrnts_in_windows_gwrvis_df, \n",
    "                                      gwrvis_metrics_df, \n",
    "                                      on=\"vrnt_id\", \n",
    "                                      how=\"left\")\n",
    "# add structural variant types and length\n",
    "vrnts_in_windows_gwrvis_df = pd.merge(vrnts_in_windows_gwrvis_df, \n",
    "                                      sv_info_df[[\"vrnt_id\", \"SVLEN\", \"SVTYPE\"]], \n",
    "                                      on=\"vrnt_id\", \n",
    "                                      how=\"left\")\n",
    "# check for NaNs\n",
    "print(f\"NAs in dataframe: {vrnts_in_windows_gwrvis_df.isnull().values.any()}\")\n",
    "print(f\"Unique variants in final output: {len(vrnts_in_windows_gwrvis_df.vrnt_id.unique())}\")\n",
    "# write results \n",
    "path_out = out_dir_path.joinpath(f\"{chrom}_sv_in_windows_{data_type}.tsv\")\n",
    "vrnts_in_windows_gwrvis_df.to_csv(path_out, sep=\"\\t\", index=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6df7f8fa",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
